home *** CD-ROM | disk | FTP | other *** search
/ Skunkware 5 / Skunkware 5.iso / src / X11 / wais / ir / irfiles.h < prev    next >
C/C++ Source or Header  |  1995-05-09  |  7KB  |  225 lines

  1. /* WIDE AREA INFORMATION SERVER SOFTWARE:
  2.    No guarantees or restrictions.  See the readme file for the full standard
  3.    disclaimer.
  4.  
  5.    Brewster@think.com
  6.  *
  7.  * $Log:    irfiles.h,v $
  8.  * Revision 1.19  92/04/16  20:04:44  morris
  9.  * small fix to dictionary_block_word_occurances, length read was
  10.  * NEXT_INDEX_BLOCK_SIZE, now it's NUMBER_OF_OCCURANCES_SIZE.
  11.  * 
  12.  * Revision 1.18  92/03/19  09:34:08  morris
  13.  * fixed the dictionary header to accurately indicate the number of blocks
  14.  * 
  15.  * Revision 1.17  92/02/17  12:38:00  jonathan
  16.  * Added defines for catalog.
  17.  * 
  18.  */
  19.  
  20. /* include file for irfiles.c */
  21.  
  22. #ifndef IRFILES_H
  23. #define IRFILES_H
  24.  
  25. #include "cdialect.h"
  26. #include "cutil.h"
  27. #include "hash.h"
  28. #include "ustubs.h" /* for time_t */
  29.  
  30. /* filename extensions for various components; each literal includes the leading '.'. NOTE(review): presumably appended to the database name by the *_filename() functions declared at the end of this header -- confirm in irfiles.c */
  31. #define dictionary_ext            ".dct"
  32. #define filename_table_ext        ".fn"
  33. #define headline_table_ext        ".hl"
  34. #define document_table_ext        ".doc"
  35. #define index_ext            ".inv"
  36. #define source_ext             ".src"
  37. #define catalog_ext             ".cat"
  38.  
  39. /* these dictionary definitions are used in irhash,irverify, and irfiles */
  40. #define DICTIONARY_HEADER_SIZE 4 /* unit not stated here; presumably bytes -- TODO confirm */
  41. #define DICTIONARY_BLOCK_SIZE 1000L  /* in entries, not bytes */
  42. #define DICTIONARY_ENTRY_HASH_CODE_SIZE 2
  43. /* #define DICTIONARY_ENTRY_COUNT_SIZE 3  moved to inverted file */
  44. /* #define DICTIONARY_ENTRY_INDEX_BLOCK_SIZE 4 not used and too long a symbol*/
  45. /* #define DICTIONARY_ELEMENT_SIZE 6 was 9 */
  46. #define DICTIONARY_SIZE 524288L /* 524288 == 2^19; what it counts is not stated in this header */
  47. #define DICTIONARY_TOTAL_SIZE_WORD "{}" /* the word that holds the total number of words in the whole dictionary */
  48.  
  49. #define INDEX_HEADER_SIZE 4 /* field widths for the index file follow; unit not stated here, presumably bytes -- TODO confirm */
  50. #define INDEX_BLOCK_SIZE_SIZE 2
  51. #define NEXT_INDEX_BLOCK_SIZE 4 /* also the width read by the dictionary_block_position macro in this header */
  52. #define INDEX_BLOCK_FLAG_SIZE 1
  53. #define INDEX_BLOCK_HEADER_SIZE 7 /* numerically INDEX_BLOCK_FLAG_SIZE + INDEX_BLOCK_SIZE_SIZE + NEXT_INDEX_BLOCK_SIZE (1 + 2 + 4); NOTE(review): presumably meant as that sum -- keep in step if a part changes */
  54. #define NUMBER_OF_OCCURANCES_SIZE 4 /* also the width read by the dictionary_block_word_occurances macro in this header */
  55. #define INDEX_BLOCK_NOT_FULL_FLAG 101 /* 101 is ASCII 'e' */
  56. #define INDEX_BLOCK_FULL_FLAG 69 /* 69 is ASCII 'E' */
  57. #define INDEX_BLOCK_DICTIONARY_FLAG 123 /* 123 is ASCII '{' */
  58.  
  59. #define DOCUMENT_ID_SIZE 4
  60. #define WORD_POSITION_SIZE 0 /* zero width */
  61. #define CHARACTER_POSITION_SIZE 3
  62. #define WEIGHT_SIZE 1
  63. #define INDEX_ELEMENT_SIZE 8 /* numerically DOCUMENT_ID_SIZE + WORD_POSITION_SIZE + CHARACTER_POSITION_SIZE + WEIGHT_SIZE (4 + 0 + 3 + 1); NOTE(review): presumably meant as that sum -- confirm */
  64. #define WORD_ID_SIZE 4 /* for posting arrays */
  65.  
  66. typedef struct database { /* state for one database handle (the type returned by openDatabase): a name, one stream per component file, and counters used while building */
  67.     char*    database_file; /* NOTE(review): presumably the base name the component extensions are added to -- confirm */
  68.     FILE*    dictionary_stream;
  69.     FILE*    filename_table_stream;
  70.     FILE*    headline_table_stream;
  71.     FILE*    document_table_stream;
  72.     FILE*    index_stream;
  73.     long    doc_table_allocated_entries;
  74.     hashtable* the_word_memory_hashtable; /* hashtable type is not declared in this file; presumably from hash.h */
  75.  
  76.     long     number_of_words_in_hashtable; /* for building.
  77.                          checked on every add_word.
  78.                            set at start of building,
  79.                            and on every flush.*/
  80.     long     flush_after_n_words; /* set at the start of building used
  81.                     to compare with 
  82.                     number_of_words_in_hashtable. */
  83.     long     number_of_words; /* for building.  number of different words.
  84.                     Set from the headers of .inv files
  85.                     as they are merged. 
  86.                     It is used to set the header when a .inv 
  87.                     file is first created (not by merging).
  88.                     */
  89.     long    index_file_number; /* for building. */
  90.     long    total_word_count; /* Total number of word occurrences.
  91.                      set during indexing, saved in 
  92.                      dictionary under 'ALL' entry. NOTE(review): this header defines DICTIONARY_TOTAL_SIZE_WORD as "{}", not 'ALL' -- one of the two may be stale; verify */
  93.     void*   ext_database; /* untyped pointer; its use is not defined in this header */
  94. } database;
  95.  
  96. typedef struct document_table_entry { /* record type taken by read_document_table_entry() and write_document_table_entry() */
  97.     long    filename_id; /* NOTE(review): presumably a position in the filename table -- confirm */
  98.     long    headline_id; /* NOTE(review): presumably a position in the headline table -- confirm */
  99.     long    source_id;    /* for signature system */
  100.     long    start_character;
  101.     long    end_character;
  102.     long     document_length; /* in characters */
  103.     long    number_of_lines; /* in lines */
  104.     time_t  date;            /* 0 if unknown */
  105. } document_table_entry;
  106.  
  107. #ifdef __cplusplus
  108. /* declare these as C style functions */
  109. extern "C"
  110.     {
  111. #endif /* def __cplusplus */
  112.  
  113. database*     openDatabase _AP((char* name, boolean initialize,boolean for_search)); /* _AP wraps every parameter list in this header; it is not defined in this file (presumably in cdialect.h -- confirm) */
  114. void        closeDatabase _AP((database* the_db));
  115. void        disposeDatabase _AP((database* the_db)); /* NOTE(review): how this differs from closeDatabase is not visible here -- see irfiles.c */
  116.  
  117. void initialize_index_files _AP((database* db));
  118.  
  119. char *read_filename_table_entry _AP((long position, /* filename-table and headline-table access; NOTE(review): filename, type and file_write_date look like caller-supplied outputs, required buffer sizes are not stated here -- confirm */
  120.                   char* filename,
  121.                   char* type, 
  122.                   time_t* file_write_date,
  123.                   database* db));
  124.  
  125. long write_filename_table_entry _AP((char* filename, char *type, database* db)); /* meaning of the long result is not stated here; presumably the entry's position -- confirm */
  126. boolean filename_in_database _AP((char *filename, char *type,
  127.                   time_t *write_file_date, database *db));
  128. boolean filename_in_filename_file _AP ((char *filename, char*type, /* same query as above but takes a filename-file path instead of a database* */
  129.                   time_t *file_write_date, 
  130.                   char* filename_file));
  131. char *read_headline_table_entry _AP((long position,database* db)); /* ownership of the returned string is not stated here -- check irfiles.c before freeing or keeping it */
  132. long write_headline_table_entry _AP((char* headline, database* db));
  133.  
  134.  
  135. boolean read_document_table_entry  /* document-table access: fills *doc_entry for entry `number`, presumably -- confirm in irfiles.c */
  136.   _AP((document_table_entry* doc_entry,long number,database* db));
  137.  
  138. long write_document_table_entry /* meaning of the long result is not stated here; presumably the new document id -- confirm */
  139.   _AP((document_table_entry* doc_table_entry, database* db));
  140.  
  141. boolean writeUserValToDocIDTable _AP((unsigned long userVal,long doc,
  142.                       database* db));
  143.  
  144.  
  145. long next_document_id _AP((database* db));
  146.  
  147.  
  148. void close_dictionary_file _AP((database *db)); /* dictionary-file prototypes follow */
  149.  
  150. long add_word_to_dictionary /* meaning of the long results in this group is not stated in this header -- see irfiles.c */
  151.      _AP((char *word, long index_file_block_number, long number_of_occurances,
  152.      database* db));
  153. long look_up_word_in_dictionary _AP((char *word, long *word_id, database* db)); /* word_id is a pointer, so presumably an output -- confirm */
  154. long init_dict_file_for_writing _AP((database *db));
  155. void init_dict_file_detailed _AP((FILE* dictionary_stream,
  156.                   long number_of_blocks));
  157. void record_num_blocks_in_dict _AP((FILE* dictionary_stream, /* NOTE(review): the name says blocks but the parameter is number_of_words -- confirm which is intended */
  158.                     long number_of_words));
  159.  
  160. long finished_add_word_to_dictionary _AP((database *db));
  161.  
  162. boolean register_src_structure _AP((char *filename)); /* source-description prototypes; NOTE(review): presumably these deal with the source_ext (".src") file -- confirm in irfiles.c */
  163. boolean write_src_structure _AP((char *filename, 
  164.                  char *database_name, 
  165.                  char *type_name, /* was spelled `typename`, a C++ keyword, which made this prototype ill-formed for C++ includers of this extern "C" header */
  166.                  char **filenames, 
  167.                  long number_of_filename, /* NOTE(review): presumably the element count of filenames -- confirm */
  168.                  boolean export_database,
  169.                  long tcp_port));
  170.  
  171. boolean build_catalog _AP((database* db)); /* NOTE(review): presumably produces the catalog_ext (".cat") file -- confirm */
  172.  
  173. long allocate_index_block _AP((long how_large, FILE* stream)); /* unit of how_large and meaning of the result are not stated here -- see irfiles.c */
  174.  
  175. unsigned char *read_dictionary_block _AP((unsigned char* block, /* unit of position and length is not stated here -- see irfiles.c */
  176.                       long position,long length,
  177.                       FILE* stream));
  178.                                       
  179. void print_dictionary _AP((database* db));
  180.  
  181. #define DICTIONARY_ENTRY_SIZE 29 /* sum of MAX_WORD_LENGTH, 1 ('\0'), 
  182.                     NEXT_INDEX_BLOCK_SIZE and
  183.                     NUMBER_OF_OCCURANCES_SIZE. With both of those sizes defined as 4 in this header, 29 implies MAX_WORD_LENGTH == 20; MAX_WORD_LENGTH is not defined in this file -- verify they agree */
  184.  
  185.  
  186. #ifdef DICT_FUNC /* accessors for entry i of a dictionary block: real functions when DICT_FUNC is defined, otherwise the macros below */
  187.  
  188. char *dictionary_block_word _AP((long i,unsigned char* block));
  189. long dictionary_block_position _AP((long i,unsigned char* block));
  190. long dictionary_block_word_occurances _AP((long i,unsigned char* block));
  191.  
  192. #else /* macros */
  193.  
  194. #define dictionary_block_word(i,block) /* pointer to the start of entry i, i.e. block + i * DICTIONARY_ENTRY_SIZE, as char* */ \
  195.   ((char *)((block) + ((i) * DICTIONARY_ENTRY_SIZE)))
  196.  
  197. #define dictionary_block_position(i,block) /* reads NEXT_INDEX_BLOCK_SIZE bytes at offset MAX_WORD_LENGTH + 1 within entry i */ \
  198.   read_bytes_from_memory(NEXT_INDEX_BLOCK_SIZE, \
  199.              (block) + ((i) * DICTIONARY_ENTRY_SIZE) + \
  200.               MAX_WORD_LENGTH + 1)
  201.  
  202. #define dictionary_block_word_occurances(i,block) /* reads NUMBER_OF_OCCURANCES_SIZE bytes immediately after the position field of entry i */ \
  203.   read_bytes_from_memory(NUMBER_OF_OCCURANCES_SIZE, \
  204.                          (block) + ((i) * DICTIONARY_ENTRY_SIZE) + \
  205.                          MAX_WORD_LENGTH + 1 + NEXT_INDEX_BLOCK_SIZE)
  206. #endif /* DICT_FUNC; read_bytes_from_memory and MAX_WORD_LENGTH are not declared in this file and must be in scope wherever the macros are used */
  207.  
  208. void print_dictionary_block _AP((unsigned char* block,long size)); /* unit of size (entries or bytes) is not stated here -- confirm */
  209.  
  210. /* database functions. NOTE(review): each presumably writes a component file name into destination and returns it; the required size of destination is not stated here -- confirm in irfiles.c */
  211. char* dictionary_filename _AP((char* destination, database* db));
  212. char* filename_table_filename _AP((char* destination, database* db));
  213. char* headline_table_filename _AP((char* destination, database* db));
  214. char* document_table_filename _AP((char* destination, database* db));
  215. char* index_filename _AP((char* destination, database* db));
  216. char* index_filename_with_version _AP((long version, char* destination, 
  217.                   database* db));
  218. char* source_filename _AP((char* destination, database* db));
  219.  
  220. #ifdef __cplusplus
  221.     }
  222. #endif /* def __cplusplus */
  223.  
  224. #endif /* IRFILES_H */
  225.